In [1]:
import pandas as pd
import numpy as np
import datetime
import plotly.express as px
import copy
In [2]:
import sys
print("Python version")
print (sys.version)
Python version
3.7.9 (default, Aug 31 2020, 17:10:11) [MSC v.1916 64 bit (AMD64)]
In [3]:
attendance_dataset=pd.read_csv('required dataset/attendance.csv')
matches=pd.read_csv('required dataset/matches.csv')
stadiums=pd.read_csv('required dataset/stadiums.csv')
players=pd.read_csv('required dataset/players.csv')
squads=pd.read_csv('required dataset/squads.csv')
tournaments=pd.read_csv("required dataset/tournaments.csv")
goals=pd.read_csv("required dataset/goals.csv")
In [4]:
matches.rename(columns={'home_team_name':'home_team','away_team_name':'away_team','match_date':'Date'},inplace=True)

1_1¶

Get The Diifferent Labels in each table Using Set()¶

for Matches Table¶

In [5]:
home_teams_matches=matches.home_team.unique().tolist()
In [6]:
away_teams_matches=matches.away_team.unique().tolist()
In [7]:
all_teams_matches=copy.deepcopy(home_teams_matches)
In [8]:
all_teams_matches.extend(away_teams_matches)
In [9]:
all_teams_matches_set=set(all_teams_matches)

for Attendance Table¶

In [10]:
home_teams_attendance=attendance_dataset.home_team.unique().tolist()
In [11]:
away_teams_attendance=attendance_dataset.away_team.unique().tolist()
In [12]:
all_teams_attendance=copy.deepcopy(home_teams_attendance)
In [13]:
all_teams_attendance.extend(away_teams_attendance)
In [14]:
all_teams_attendance_set=set(all_teams_attendance)

Different teams_names¶

In [15]:
different_teams_attendance=all_teams_attendance_set-all_teams_matches_set
different_teams_attendance
Out[15]:
{'China PR',
 "Côte d'Ivoire",
 'FR Yugoslavia',
 'Germany DR',
 'IR Iran',
 'Korea DPR',
 'Korea Republic',
 'Türkiye'}
In [16]:
different_teams_matches=all_teams_matches_set-all_teams_attendance_set
list(different_teams_matches)
Out[16]:
['Ivory Coast',
 'North Korea',
 'Iran',
 'East Germany',
 'South Korea',
 'Turkey',
 'China']
In [17]:
correct_name_dict={
    "China PR":"China",
    "Côte d'Ivoire":"Ivory Coast",
    "FR Yugoslavia":"Yugoslavia",
    "Germany DR":"East Germany",
    "IR Iran":"Iran",
    "Korea DPR":"North Korea",
    "Korea Republic":"South Korea",
    "Türkiye":"Turkey",
}

Fix Different names in the two tables¶

In [18]:
correct_attendance_home_teams=[]
correct_attendance_away_teams=[]
correct_matches_home_teams=[]
correct_matches_away_teams=[]
seperaion_germany_date= datetime.datetime.strptime("1950-01-01", '%Y-%m-%d').date()
union_germany_date = datetime.datetime.strptime("1994-01-01", '%Y-%m-%d').date()
In [19]:
def correct_attendance_name(lables):
    home_team=lables[0]
    away_team=lables[1]
    date=lables[2]
    d = datetime.datetime.strptime(date, '%Y-%m-%d').date()
    
    if home_team in correct_name_dict:
        home_team=correct_name_dict[home_team]
        
    elif home_team =="Germany" and d < union_germany_date and d > seperaion_germany_date:
        home_team="West Germany"
        
    if away_team in correct_name_dict:
        away_team=correct_name_dict[away_team]
        
    elif away_team =="Germany" and d < union_germany_date and d > seperaion_germany_date:
        away_team="West Germany"
    
    correct_attendance_home_teams.append(home_team)
    correct_attendance_away_teams.append(away_team)    
In [20]:
attendance_dataset[['home_team','away_team','Date']].apply(correct_attendance_name,axis=1)
Out[20]:
0      None
1      None
2      None
3      None
4      None
       ... 
959    None
960    None
961    None
962    None
963    None
Length: 964, dtype: object
In [21]:
attendance_dataset['home_team']=correct_attendance_home_teams
attendance_dataset['away_team']=correct_attendance_away_teams

Merge Stadiums and Attendance with Matches table¶

In [22]:
data=pd.merge(matches,stadiums[['stadium_capacity','stadium_id']],on='stadium_id')
In [23]:
data.insert(15, 'stadium_capacity', data.pop('stadium_capacity'))
In [24]:
def fill_attendence(match):
    attendance=attendance_dataset.loc[((attendance_dataset.home_team==match[0]) | (attendance_dataset.away_team==match[0])) & (attendance_dataset.Date==match[2])]["Attendance"]
    return int(attendance)
In [25]:
attendance_values=data[['home_team','away_team','Date']].apply(fill_attendence,axis=1)
In [26]:
data['Attendance']=attendance_values

Check if the sample are connected successfuly and there is no missing samples¶

In [27]:
attendance_dataset.iloc[::-1].iloc[:15]
Out[27]:
home_team away_team Attendance Date
963 France Mexico 4444 1930-07-13
962 United States Belgium 18346 1930-07-13
961 Romania Peru 2549 1930-07-14
960 Yugoslavia Brazil 24059 1930-07-14
959 Argentina France 23409 1930-07-15
958 Chile Mexico 9249 1930-07-16
957 United States Paraguay 18306 1930-07-17
956 Yugoslavia Bolivia 18306 1930-07-17
955 Uruguay Peru 57735 1930-07-18
954 Argentina Mexico 42100 1930-07-19
953 Chile France 2000 1930-07-19
952 Paraguay Belgium 12000 1930-07-20
951 Brazil Bolivia 25466 1930-07-20
950 Uruguay Romania 70022 1930-07-21
949 Argentina Chile 41459 1930-07-22
In [28]:
stadiums.loc[(stadiums.stadium_id=="S-193")|(stadiums.stadium_id=="S-192")|(stadiums.stadium_id=="S-191")][['stadium_capacity','stadium_id']]
Out[28]:
stadium_capacity stadium_id
190 90000 S-191
191 20000 S-192
192 10000 S-193
In [29]:
data[['home_team','away_team','Date','stadium_capacity','stadium_id','Attendance']].iloc[:15]
Out[29]:
home_team away_team Date stadium_capacity stadium_id Attendance
0 France Mexico 1930-07-13 10000 S-193 4444
1 Romania Peru 1930-07-14 10000 S-193 2549
2 United States Belgium 1930-07-13 20000 S-192 18346
3 Yugoslavia Brazil 1930-07-14 20000 S-192 24059
4 Argentina France 1930-07-15 20000 S-192 23409
5 Chile Mexico 1930-07-16 20000 S-192 9249
6 Yugoslavia Bolivia 1930-07-17 20000 S-192 18306
7 United States Paraguay 1930-07-17 20000 S-192 18306
8 Uruguay Peru 1930-07-18 90000 S-191 57735
9 Chile France 1930-07-19 90000 S-191 2000
10 Argentina Mexico 1930-07-19 90000 S-191 42100
11 Brazil Bolivia 1930-07-20 90000 S-191 25466
12 Paraguay Belgium 1930-07-20 90000 S-191 12000
13 Uruguay Romania 1930-07-21 90000 S-191 70022
14 Argentina Chile 1930-07-22 90000 S-191 41459

1_2:¶

In [30]:
players.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8485 entries, 0 to 8484
Data columns (total 12 columns):
 #   Column                 Non-Null Count  Dtype 
---  ------                 --------------  ----- 
 0   key_id                 8485 non-null   int64 
 1   player_id              8485 non-null   object
 2   family_name            8485 non-null   object
 3   given_name             8485 non-null   object
 4   birth_date             8484 non-null   object
 5   goal_keeper            8485 non-null   int64 
 6   defender               8485 non-null   int64 
 7   midfielder             8485 non-null   int64 
 8   forward                8485 non-null   int64 
 9   count_tournaments      8485 non-null   int64 
 10  list_tournaments       8485 non-null   object
 11  player_wikipedia_link  8485 non-null   object
dtypes: int64(6), object(6)
memory usage: 795.6+ KB
In [31]:
players_teams=players[['player_id','given_name','family_name','count_tournaments','list_tournaments']]
In [32]:
teams_name=[]
teams_code=[]
teams_nums=[]
def get_players_teams(labels):
    player_id=labels[0]
    player_appereance=squads.loc[squads.player_id==player_id]
    player_teams_name=player_appereance.team_name.unique().tolist()
    player_teams_code=player_appereance.team_code.unique().tolist()
    teams_name.append(player_teams_name)
    teams_code.append(player_teams_code)
    teams_nums.append(len(player_teams_name))
In [33]:
players_teams[['player_id']].apply(get_players_teams,axis=1)
players_teams['teams_name']=teams_name
players_teams['teams_code']=teams_code
players_teams['teams_nums']=teams_nums
D:\Programs\miniconda3\envs\BestEnv\lib\site-packages\ipykernel_launcher.py:2: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
D:\Programs\miniconda3\envs\BestEnv\lib\site-packages\ipykernel_launcher.py:3: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until
D:\Programs\miniconda3\envs\BestEnv\lib\site-packages\ipykernel_launcher.py:4: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  after removing the cwd from sys.path.
In [34]:
players_teams[players_teams.teams_nums >1].iloc[:10]
Out[34]:
player_id given_name family_name count_tournaments list_tournaments teams_name teams_code teams_nums
331 P-01512 José Altafini 2 1958, 1962 [Brazil, Italy] [BRA, ITA] 2
937 P-01675 Thomas Berthold 3 1986, 1990, 1994 [West Germany, Germany] [DEU] 2
1039 P-03207 Alen Bokšić 2 1990, 2002 [Yugoslavia, Croatia] [YUG, HRV] 2
1087 P-05308 Aleksandr Borodyuk 2 1990, 1994 [Soviet Union, Russia] [SUN, RUS] 2
1166 P-00830 Andreas Brehme 3 1986, 1990, 1994 [West Germany, Germany] [DEU] 2
1227 P-05379 Guido Buchwald 2 1990, 1994 [West Germany, Germany] [DEU] 2
2037 P-02845 Attilio Demaría 2 1930, 1934 [Argentina, Italy] [ARG, ITA] 2
3036 P-06666 Sergei Gorlukovich 2 1990, 1994 [Soviet Union, Russia] [SUN, RUS] 2
3294 P-02040 Thomas Häßler 3 1990, 1994, 1998 [West Germany, Germany] [DEU] 2
3572 P-04641 Bodo Illgner 2 1990, 1994 [West Germany, Germany] [DEU] 2

2:¶

total_goals_in_match¶

In [35]:
def total_goals_in_match(labels):
    total=labels[0]+labels[1]
    return total;
In [36]:
data['total_goals_in_match']=data[['home_team_score','away_team_score']].apply(total_goals_in_match,axis=1)

match_for_host¶

In [37]:
def match_for_host(labels):
    home_team=labels[0]
    away_team=labels[1]
    host=labels[2]
    if home_team ==host:
        return True
    elif away_team==host:
        return True
    return False
In [38]:
data['match_for_host']=data[['home_team','away_team','country_name']].apply(match_for_host,axis=1)

used_capacity_ratio¶

In [39]:
def used_capacity_ratio(labels):
    capacity=labels[0]
    attendance=labels[1]
    return capacity/attendance
In [40]:
data['used_capacity_ratio']=data[['stadium_capacity','Attendance']].apply(used_capacity_ratio,axis=1)

attendance_category¶

In [41]:
data['attendance_category']=pd.qcut(data['Attendance'],q=4,labels=['Low',"Medium","High","Very High"])

relative_attendance_category¶

In [42]:
data['relative_attendance_category']=pd.qcut(data['used_capacity_ratio'],q=4,labels=['Very High',"High","Medium","Low"])
In [43]:
data[['Attendance','stadium_capacity','attendance_category','relative_attendance_category']]
Out[43]:
Attendance stadium_capacity attendance_category relative_attendance_category
0 4444 10000 Low Low
1 2549 10000 Low Low
2 18346 20000 Low Medium
3 24059 20000 Low Very High
4 23409 20000 Low Very High
... ... ... ... ...
959 41823 44000 Medium High
960 39789 44000 Medium Medium
961 41232 44000 Medium High
962 43443 44000 High High
963 42523 44000 Medium High

964 rows × 4 columns

host_country_code¶

i took the away team because it has every team that exist in home teams and more¶

In [44]:
name_code_dict={}
for name,code in zip(data['away_team'],data['away_team_code']):
    name_code_dict[name]=code
In [45]:
def host_country_code(labels):
    country_name=labels[0]
    return name_code_dict[country_name]
In [46]:
data['host_country_code']=data[['country_name']].apply(host_country_code,axis=1)

tournament_year¶

In [47]:
def tournament_year(labels):
    date=labels[0]
    d=datetime.datetime.strptime(date,'%Y-%m-%d')
    return d.year
In [48]:
data['tournament_year']=data[['Date']].apply(tournament_year,axis=1)

full_name¶

In [49]:
def full_name(labels):
    first_name=labels[0]
    last_name=labels[1]
    return " ".join([first_name,last_name])
In [50]:
players["full_name"]=players[['given_name','family_name']].apply(full_name,axis=1)

short_stage_name¶

In [51]:
def short_stage_name(labels):
    g=labels[0]
    k=labels[1]
    if g==1 :
        return "Group Match"
    elif k==1:
        return "Knockout Match"
In [52]:
data['short_stage_name']=data[['group_stage','knockout_stage']].apply(short_stage_name,axis=1)

winner_code¶

In [53]:
def winner_code(labels):
    winner_name=labels[0]
    return name_code_dict[winner_name]
In [54]:
tournaments['winner_code']=tournaments[['winner']].apply(winner_code,axis=1)

late_goal¶

In [55]:
last_goals=[]
def last_goal(labels):
    match_id=labels[0]
    last_goal=goals[goals.match_id==match_id]['minute_label'].tolist()
    if len(last_goal) ==0:
        last_goals.append(0)
        return 0
    return last_goal.pop()
In [56]:
data['last_goal']=data[['match_id']].apply(last_goal,axis=1)
In [57]:
def late_goal(labels):
    last_goal=labels[0]
    extra_time=labels[1]
    penalty_shootout=labels[2]
    if last_goal==0:
        return "No Late Goal"
    #no extra time like just 86'
    if len(last_goal)<4:
        if int(last_goal.split('\'')[0]) >85 and extra_time ==0:
            return "Late Goal"
        else:
            return "No Late Goal"
    # extra time like 90'+4',120'+2' etc..
    elif len(last_goal)>=4:
        if (int(last_goal.split('\'')[0])>=90 and extra_time==0) or (int(last_goal.split('\'')[0])>115 and penalty_shootout==0):
            return "Late Goal"
        else:
            return "No Late Goal"
        
    return "No Late Goal"
In [58]:
data['late_goal']=data[['last_goal','extra_time','penalty_shootout']].apply(late_goal,axis=1)
In [59]:
data.iloc[0:,-15:]
Out[59]:
result home_team_win away_team_win draw Attendance total_goals_in_match match_for_host used_capacity_ratio attendance_category relative_attendance_category host_country_code tournament_year short_stage_name last_goal late_goal
0 home team win 1 0 0 4444 5 False 2.250225 Low Low URY 1930 Group Match 87' Late Goal
1 home team win 1 0 0 2549 4 False 3.923107 Low Low URY 1930 Group Match 89' Late Goal
2 home team win 1 0 0 18346 3 False 1.090156 Low Medium URY 1930 Group Match 69' No Late Goal
3 home team win 1 0 0 24059 3 False 0.831290 Low Very High URY 1930 Group Match 62' No Late Goal
4 home team win 1 0 0 23409 1 False 0.854372 Low Very High URY 1930 Group Match 81' No Late Goal
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
959 away team win 0 1 0 41823 1 False 1.052053 Medium High QAT 2022 Group Match 23' No Late Goal
960 draw 0 0 1 39789 6 False 1.105833 Medium Medium QAT 2022 Group Match 66' No Late Goal
961 home team win 1 0 0 41232 1 False 1.067132 Medium High QAT 2022 Group Match 60' No Late Goal
962 away team win 0 1 0 43443 2 False 1.012821 High High QAT 2022 Group Match 32' No Late Goal
963 away team win 0 1 0 42523 2 False 1.034734 Medium High QAT 2022 Knockout Match 55' No Late Goal

964 rows × 15 columns

3_a¶

3_a_1¶

In [60]:
my_data = data[['Attendance','tournament_year']].groupby(['tournament_year']).mean().reset_index()
my_data = my_data.rename(columns = {"Attendance": 'mean'})
my_data['median'] = data[['Attendance','tournament_year']].groupby(['tournament_year']).median().reset_index().Attendance
fig = px.line(my_data,x='tournament_year',y=['mean','median'],title='Attendance Over The Years')
fig.show()

3_a_2¶

In [61]:
fig = px.histogram(data,x='Attendance',nbins=50)
fig.show()

3_a_3¶

In [62]:
fig = px.box(data,x='tournament_year',y='Attendance')
fig.show()

3_b¶

3_b_1¶

In [63]:
def convert_minute_label_to_int(labels):
    minute_label=labels[0]
    if len(minute_label)<=4:
        return int(minute_label.split('\'')[0])
    else:
        return int(minute_label.split('\'')[0])+ int(minute_label.split('\'')[1][1])
In [64]:
goals['minute_label_int']=goals[['minute_label']].apply(convert_minute_label_to_int,axis=1)
In [65]:
my_data = goals[['minute_label_int','tournament_id']].groupby(['tournament_id']).mean().reset_index()
my_data = my_data.rename(columns = {'minute_label_int':'mean'})
fig = px.bar(my_data,x='tournament_id',y='mean')
fig.show()

3_b_2¶

In [66]:
my_data = goals[['goal_id','match_id']].groupby(['match_id']).size().reset_index(name='count')
fig = px.histogram(my_data, y='count',x='match_id')
fig.show()

3_b_3¶

In [67]:
def get_most_repeated_goals(minute_labels):
    return minute_labels['minute_label_int'].mode()
In [68]:
my_data=goals[['tournament_id','minute_label','minute_label_int']].groupby(['tournament_id']).apply(get_most_repeated_goals).reset_index()
my_data.rename(columns={0:'most_repeated_minute'},inplace=True)
fig = px.scatter(my_data, y="most_repeated_minute",x='tournament_id')
fig.show()

3_b_4¶

In [69]:
my_data=data[['late_goal','tournament_id']].groupby(['tournament_id']).size().reset_index(name='count')
fig = px.histogram(my_data, y='count',x='tournament_id')
fig.show()

3_b_5¶

In [70]:
my_data=goals[['given_name','family_name','goal_id']].groupby(['given_name','family_name']).size().reset_index(name='count').sort_values('count').reset_index()
my_data = my_data.iloc[-12:]
fig = px.bar(my_data, y='count',x='family_name')
fig.show()

3_b_6¶

In [71]:
sepeated_data=goals.groupby(['tournament_id','given_name','family_name']).size().reset_index(name='count')
In [72]:
final_data=sepeated_data.groupby('tournament_id')
In [73]:
top_scorer={}
for name,group in final_data:
    group.reset_index(inplace=True)
    index=group['count'].idxmax()
    player_name=group['family_name'].iloc[index]
    top_scorer[name + ' / '+ player_name]=group['count'].max()
In [74]:
fig = px.bar(y=top_scorer.values(), x=top_scorer.keys())
fig.show()

3_b_7¶

In [75]:
my_data = goals.groupby(['tournament_id']).size().reset_index(name='count')
fig = px.bar(my_data, y='count',x='tournament_id')
fig.show()

3_b_8¶

In [76]:
#East Germany,West Germany , Germany , Brazil , Italy:
specific_teams=goals.loc[(goals.team_name=='East Germany') |(goals.team_name=='West Germany') | (goals.team_name=="Germany") | (goals.team_name == "Italy") | (goals.team_name=="Brazil")]
In [77]:
fig = px.strip(specific_teams, y='minute_label_int',x='stage_name')
fig.show()

3_c¶

3_c_1¶

In [78]:
all_matches={}
def get_most_repeated_matches(labels):
    home_team=labels[0]
    away_team=labels[1]
    full_team= home_team + ' vs ' + away_team
    if full_team not in all_matches:
        all_matches[full_team]=1
    else:
        all_matches[full_team]=all_matches[full_team]+1
In [79]:
matches[['home_team','away_team']].apply(get_most_repeated_matches,axis=1)
Out[79]:
0      None
1      None
2      None
3      None
4      None
       ... 
959    None
960    None
961    None
962    None
963    None
Length: 964, dtype: object

3_c_2¶

In [80]:
most_repeated_matchessorted=sorted(all_matches.items(),key=lambda x:x[1])[-10:]
In [81]:
repeated_matches=[x[1] for x in most_repeated_matchessorted[-12:]]
repeated_num=[x[0] for x in most_repeated_matchessorted[-12:]]

fig = px.bar(y=repeated_matches, x=repeated_num)
fig.show()

3_d¶

3_d_1¶

In [82]:
players_teams.loc[players_teams['teams_nums']>=2][players_teams.columns]
Out[82]:
player_id given_name family_name count_tournaments list_tournaments teams_name teams_code teams_nums
331 P-01512 José Altafini 2 1958, 1962 [Brazil, Italy] [BRA, ITA] 2
937 P-01675 Thomas Berthold 3 1986, 1990, 1994 [West Germany, Germany] [DEU] 2
1039 P-03207 Alen Bokšić 2 1990, 2002 [Yugoslavia, Croatia] [YUG, HRV] 2
1087 P-05308 Aleksandr Borodyuk 2 1990, 1994 [Soviet Union, Russia] [SUN, RUS] 2
1166 P-00830 Andreas Brehme 3 1986, 1990, 1994 [West Germany, Germany] [DEU] 2
1227 P-05379 Guido Buchwald 2 1990, 1994 [West Germany, Germany] [DEU] 2
2037 P-02845 Attilio Demaría 2 1930, 1934 [Argentina, Italy] [ARG, ITA] 2
3036 P-06666 Sergei Gorlukovich 2 1990, 1994 [Soviet Union, Russia] [SUN, RUS] 2
3294 P-02040 Thomas Häßler 3 1990, 1994, 1998 [West Germany, Germany] [DEU] 2
3572 P-04641 Bodo Illgner 2 1990, 1994 [West Germany, Germany] [DEU] 2
3719 P-04605 Robert Jarni 3 1990, 1998, 2002 [Yugoslavia, Croatia] [YUG, HRV] 2
4131 P-06604 Jürgen Klinsmann 3 1990, 1994, 1998 [West Germany, Germany] [DEU] 2
4155 P-08277 Jürgen Kohler 3 1990, 1994, 1998 [West Germany, Germany] [DEU] 2
4200 P-03556 Andreas Köpke 3 1990, 1994, 1998 [West Germany, Germany] [DEU] 2
4968 P-09502 Lothar Matthäus 5 1982, 1986, 1990, 1994, 1998 [West Germany, Germany] [DEU] 2
5200 P-09293 Savo Milošević 2 1998, 2006 [Yugoslavia, Serbia and Montenegro] [YUG, SCG] 2
5280 P-01103 Andreas Möller 3 1990, 1994, 1998 [West Germany, Germany] [DEU] 2
5304 P-02543 Luis Monti 2 1930, 1934 [Argentina, Italy] [ARG, ITA] 2
6319 P-02576 Robert Prosinečki 3 1990, 1998, 2002 [Yugoslavia, Croatia] [YUG, HRV] 2
6340 P-01259 Ferenc Puskás 2 1954, 1962 [Hungary, Spain] [HUN, ESP] 2
6396 P-01757 Rudolf Raftl 2 1934, 1938 [Austria, Germany] [AUT, DEU] 2
6506 P-06121 Stefan Reuter 2 1990, 1998 [West Germany, Germany] [DEU] 2
6545 P-07779 Karl-Heinz Riedle 2 1990, 1994 [West Germany, Germany] [DEU] 2
6903 P-05394 José Santamaría 2 1954, 1962 [Uruguay, Spain] [URY, ESP] 2
6974 P-09633 Willibald Schmaus 2 1934, 1938 [Austria, Germany] [AUT, DEU] 2
7353 P-02369 Dejan Stanković 3 1998, 2006, 2010 [Yugoslavia, Serbia and Montenegro, Serbia] [YUG, SCG, SRB] 3
7403 P-08793 Vladimir Stojković 3 2006, 2010, 2018 [Serbia and Montenegro, Serbia] [SCG, SRB] 2
7424 P-08975 Josef Stroh 2 1934, 1938 [Austria, Germany] [AUT, DEU] 2
7451 P-01739 Davor Šuker 3 1990, 1998, 2002 [Yugoslavia, Croatia] [YUG, HRV] 2
7620 P-07346 Olaf Thon 3 1986, 1990, 1998 [West Germany, Germany] [DEU] 2
8030 P-06299 Nemanja Vidić 2 2006, 2010 [Serbia and Montenegro, Serbia] [SCG, SRB] 2
8093 P-02452 Rudi Völler 3 1986, 1990, 1994 [West Germany, Germany] [DEU] 2
8131 P-00537 Franz Wagner 2 1934, 1938 [Austria, Germany] [AUT, DEU] 2
8443 P-02502 Nikola Žigić 2 2006, 2010 [Serbia and Montenegro, Serbia] [SCG, SRB] 2

the most obvious reasons we can get from the previous data is the political reasons like when west_germany and germany seperated and reunion or when Yugoslavia broke up into serveral contries : Bosnia and Herzegovina, Croatia, Macedonia, Montenegro, Serbia and Slovenia.so each player has to represent another team.¶

3_d_2¶

In [83]:
def host_winner(labels):
    host=labels[0]
    winner=labels[1]
    if host == winner:
        return 1
    else:
        return 0
In [84]:
tournaments['host_winner']=tournaments[['host_country','winner']].apply(host_winner,axis=1)
In [85]:
grouped_data = tournaments.groupby(['host_winner']).size().reset_index(name='count')
In [86]:
px.bar(x=['host_didnt_win','host_win'],y=[grouped_data['count'].tolist()])

This means that about half of the World Cups were won by the hosts So we can conclude that when the tournaments in your land and you are surrounded by your audience it will have huge effect on your performance in the field¶

3_d_3¶

In [87]:
grouped_home_data = data[['home_team','Attendance']].groupby(['home_team']).sum().reset_index()
grouped_home_data.rename(columns={'Attendance':'home_attendance'},inplace=True)
In [88]:
grouped_away_data = data[['away_team','Attendance']].groupby(['away_team']).sum().reset_index()
grouped_away_data.rename(columns={'Attendance':'away_attendance'},inplace=True)
In [89]:
grouped_team_data=pd.concat([grouped_home_data,grouped_away_data],axis=1)
In [90]:
grouped_team_data.dropna(inplace=True)
In [91]:
fig = px.line(grouped_team_data,x='home_team',y=['home_attendance','away_attendance'],title='home attendance vs away attendance')
fig.show()

from the previous figure we can conclude that the teams which has a high popularity like Brazil and Argentina it has a huge attendance difference when it play at home¶

3_d_4¶

When The Country is the host:¶

In [92]:
def get_match_for_host(labels):
    home_team=labels[0]
    away_team=labels[1]
    country=labels[2]
    if home_team==country or away_team == country:
        return True
    else:
        return False
In [93]:
host_is_playing=data[['home_team','away_team','country_name']].apply(get_match_for_host,axis=1)
In [94]:
host_data=data.loc[host_is_playing][['home_team','away_team','country_name','Attendance']]
In [95]:
host_attend=host_data.groupby('country_name').sum().reset_index()
In [96]:
host_attend['number_of_matches']=host_data.groupby('country_name').size().reset_index(name='number_of_mathces')["number_of_mathces"]

divide the attendance by the number of matches to get the true values¶

In [97]:
host_attend['mean_values']=host_attend['Attendance']//host_attend['number_of_matches']

When the same Counrtys is not the host:¶

In [98]:
def get_match_for_not_host(labels):
    home_team=labels[0]
    away_team=labels[1]
    country=labels[2]
    if home_team!=country and away_team != country and (home_team in data.country_name.unique().tolist() or away_team in data.country_name.unique().tolist()):
        return True
    else:
        return False
In [99]:
host_is_not_playing=data[['home_team','away_team','country_name']].apply(get_match_for_not_host,axis=1)
In [100]:
not_host_data=data.loc[host_is_not_playing][['home_team','away_team','country_name','Attendance']]
In [101]:
not_host_attend=not_host_data.groupby('country_name').sum().reset_index()
In [102]:
not_host_attend['number_of_matches']=not_host_data.groupby('country_name').size().reset_index(name='number_of_mathces')["number_of_mathces"]

divide the attendance by the number of matches to get the true values¶

In [103]:
not_host_attend['mean_values']=not_host_attend['Attendance']//not_host_attend['number_of_matches']

concat the host and not host mean atttendance¶

In [104]:
final_data=pd.DataFrame({'country':host_attend.country_name,'host_mean_values':host_attend.mean_values,'not_host_mean_values':not_host_attend.mean_values})
In [105]:
fig = px.line(final_data,x='country',y=['host_mean_values','not_host_mean_values'],title='host attendance vs not host| attendance')
fig.show()

as we can see there is a huge difference(Almost doubled) in the attendance when the country become the host¶

4_ Secret Mission:¶

see if the ages of the players has an impact of winning the tournament or not:¶

In [106]:
teams_players=pd.DataFrame(columns={'team_name','players_id','tournament_id'})
In [107]:
ids=[]
names=[]
tournament_ids=[]
for name,group in squads.groupby(['tournament_id',"team_name"]):
    group.reset_index()
    ids.append(group.player_id.tolist())
    names.append(name[1])
    tournament_ids.append(name[0])
In [108]:
teams_players['team_name']=names
teams_players['players_id']=ids
teams_players['tournament_id']=tournament_ids
In [109]:
teams_players.insert(2,'players_id',teams_players.pop('players_id'))
In [110]:
winner_team_players_ids=pd.DataFrame(columns=['tournament_id','winner_team','players_ids'])
In [111]:
teams=[]
ids=[]
tournament_ids=[]
for index,row in tournaments[['tournament_id','winner']].iterrows():
    tournament_id=row[0]
    team_name=row[1]
    id_players=teams_players.loc[(teams_players['tournament_id']==tournament_id) & (teams_players['team_name']==team_name)]['players_id'].tolist()
    teams.append(team_name)
    tournament_ids.append(tournament_id)
    ids.append(id_players[0])
In [112]:
winner_team_players_ids['tournament_id']=tournament_ids
winner_team_players_ids['winner_team']=teams
winner_team_players_ids['players_ids']=ids
In [113]:
def get_mean_ages(labels):
    ages=[]
    for player_id in labels[0]:
        birth=players.loc[players['player_id']==player_id]['birth_date'].tolist()[0]
        birth_year=int(birth.split('-')[0])
        tour_year=int(labels[1].split('-')[1])
        age=tour_year-birth_year
        ages.append(age)
    return np.array(ages).mean()
In [114]:
winner_team_players_ids['mean_ages']=winner_team_players_ids[['players_ids','tournament_id']].apply(get_mean_ages,axis=1)
In [115]:
winner_team_players_ids
Out[115]:
tournament_id winner_team players_ids mean_ages
0 WC-1930 Uruguay [P-04671, P-09960, P-00948, P-09427, P-01245, ... 26.681818
1 WC-1934 Italy [P-06155, P-08634, P-09236, P-04825, P-07663, ... 28.409091
2 WC-1938 Italy [P-05226, P-02232, P-08903, P-02773, P-07300, ... 26.500000
3 WC-1950 Uruguay [P-00352, P-01717, P-03413, P-09441, P-09730, ... 25.681818
4 WC-1954 West Germany [P-05844, P-08245, P-08247, P-03533, P-02287, ... 28.045455
5 WC-1958 Brazil [P-02577, P-08512, P-02385, P-02499, P-07958, ... 25.954545
6 WC-1962 Brazil [P-02385, P-02499, P-08400, P-09928, P-08419, ... 28.136364
7 WC-1966 England [P-00753, P-07792, P-00049, P-08377, P-06862, ... 27.227273
8 WC-1970 Brazil [P-08384, P-07366, P-08598, P-04742, P-06480, ... 25.045455
9 WC-1974 West Germany [P-07375, P-00117, P-02579, P-07938, P-08834, ... 27.318182
10 WC-1978 Argentina [P-07746, P-02797, P-09250, P-01875, P-01242, ... 26.272727
11 WC-1982 Italy [P-03649, P-02667, P-02782, P-06485, P-07960, ... 27.318182
12 WC-1986 Argentina [P-08578, P-09253, P-05640, P-04444, P-00625, ... 27.045455
13 WC-1990 West Germany [P-04641, P-06121, P-00830, P-08277, P-08719, ... 27.772727
14 WC-1994 Brazil [P-07055, P-07794, P-06428, P-00131, P-00431, ... 27.772727
15 WC-1998 France [P-08621, P-02758, P-08661, P-01388, P-09168, ... 27.454545
16 WC-2002 Brazil [P-00168, P-04884, P-05789, P-07081, P-04820, ... 26.739130
17 WC-2006 Italy [P-00677, P-09031, P-02751, P-00192, P-08335, ... 28.782609
18 WC-2010 Spain [P-08599, P-02614, P-08653, P-00806, P-09494, ... 26.304348
19 WC-2014 Germany [P-09934, P-05176, P-02378, P-08606, P-03098, ... 26.217391
20 WC-2018 France [P-09916, P-09949, P-07157, P-00846, P-02672, ... 26.086957
21 WC-2022 Argentina [P-00773, P-09642, P-02432, P-03476, P-03419, ... 27.807692
In [116]:
fig = px.bar(winner_team_players_ids,x='tournament_id',y='mean_ages')
fig.show()

we can see that all the winner teams has somthing in common which is the mean ages of it players is around '25'¶

In [ ]: